Data cleaning

# Load the raw Titanic data and visualise column types and missingness
# (vis_dat shows Cabin/Age gaps that the cleaning below deals with).
titanic <- read_csv("titanic.csv")
visdat::vis_dat(titanic, palette = "cb_safe")+
  labs(title = "Raw Data Visualization")

# extract the title
# Pull the honorific title (the word before a ".") out of Name into Title.
# FIX: spell out `regex` in full -- the original `reg =` only worked via
# R's partial argument matching, which lintr flags and which breaks if
# tidyr ever adds another argument starting with "reg".
# NOTE(review): `t` shadows base::t(); kept because later steps use it.
t <- titanic %>%
  extract(Name, into = "Title", regex = " ([A-Za-z]+)\\.")

# combine the high-ranking title
# Collapse every rare/honorific title into a single "HighRank" level.
# FIX: the original dropped Title and cbind()-ed a data frame of
# rep("HighRank", 66) -- the hard-coded 66 silently breaks (or errors)
# if the data change. mutate() rewrites the column for however many
# rows match; downstream rbind() matches columns by name, so the
# different column position is harmless.
highrank <- t %>%
  filter(!Title %in% c("Mr", "Mrs", "Miss", "Ms")) %>%
  mutate(Title = "HighRank")
  
# Rebuild the full data set with the recoded titles, then clean it.
# NOTE: step order matters here -- Cabin is dropped *before* na.omit(),
# presumably because Cabin is largely missing (confirm against vis_dat
# above); otherwise na.omit() would discard most rows.
clean <- filter(t, Title %in% c("Mr", "Mrs", "Miss", "Ms")) %>% 
  rbind(highrank) %>% 
  arrange(PassengerId) %>% 
  select(-Cabin) %>% 
  na.omit() %>% 
  select(-Ticket, -PassengerId) %>% 
  # Pclass is categorical, Survived is 0/1 -> logical
  mutate(Pclass = as.factor(Pclass),
         Survived = as.logical(Survived))

# Manual correction of one mis-extracted title.
# FIX: use `<-` for assignment (file convention and tidyverse style).
# NOTE(review): the hard-coded row index 356 is fragile -- it silently
# targets a different passenger if any upstream cleaning step changes.
clean[356, "Title"] <- "Miss"

Split the data: 80% training, 20% test

# Random 80/20 train/test split (one Bernoulli draw per row, so the
# realised split is only approximately 80/20).
# FIX: renamed the indicator vector -- the original `sample` shadowed
# base::sample(), a classic foot-gun for any later code in the session.
set.seed(5318)
in_train <- sample(c(TRUE, FALSE), nrow(clean), replace = TRUE, prob = c(0.8, 0.2))
train <- clean[in_train, ]
test  <- clean[!in_train, ]

Fit a logistic regression using all available predictors

# Baseline model: logistic regression of Survived on every remaining column.
model1 <- glm(Survived ~ ., family = "binomial", data = train)
summary(model1)
## 
## Call:
## glm(formula = Survived ~ ., family = "binomial", data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6712  -0.5706  -0.3360   0.5577   2.5193  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  16.215065 614.240534   0.026 0.978939    
## Pclass2      -1.475395   0.400961  -3.680 0.000234 ***
## Pclass3      -2.649996   0.417732  -6.344 2.24e-10 ***
## TitleMiss   -11.605639 614.240465  -0.019 0.984925    
## TitleMr      -1.565998   0.429437  -3.647 0.000266 ***
## TitleMrs    -10.539407 614.240551  -0.017 0.986310    
## Sexmale     -12.626797 614.240528  -0.021 0.983599    
## Age          -0.054424   0.010949  -4.971 6.66e-07 ***
## SibSp        -0.498806   0.148219  -3.365 0.000765 ***
## Parch        -0.197488   0.152584  -1.294 0.195565    
## Fare          0.002177   0.002888   0.754 0.450934    
## EmbarkedQ    -0.811946   0.726763  -1.117 0.263905    
## EmbarkedS    -0.397705   0.322219  -1.234 0.217103    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 762.00  on 568  degrees of freedom
## Residual deviance: 465.89  on 556  degrees of freedom
## AIC: 491.89
## 
## Number of Fisher Scoring iterations: 13

Check the model's overall p-value (likelihood-ratio test against the null model)

# Likelihood-ratio test of model1 against the null model:
# chi-squared = null deviance - residual deviance, df = 568 - 556.
# FIX: residual deviance is 465.89 (see the summary above), not 456.89;
# also spell out FALSE instead of the reassignable shortcut F.
pchisq(762 - 465.89, 568 - 556, lower.tail = FALSE)
## [1] 3.967624e-58  (stale output from the mistyped 456.89 -- re-run to refresh)

Collinearity check (variance inflation factors)

# Generalised VIFs for model1. Title and Sex have enormous GVIFs
# (~1e7), i.e. they carry nearly the same information -- unsurprising,
# since titles such as Mr/Mrs/Miss largely encode sex.
car::vif(model1)
##                  GVIF Df GVIF^(1/(2*Df))
## Pclass   2.354714e+00  2        1.238752
## Title    1.043554e+07  3       14.782656
## Sex      6.768920e+06  1     2601.714882
## Age      1.858304e+00  1        1.363196
## SibSp    1.341332e+00  1        1.158159
## Parch    1.410742e+00  1        1.187746
## Fare     1.608805e+00  1        1.268387
## Embarked 1.210050e+00  2        1.048820

Remove Sex (its VIF is inflated because Title already encodes it)

# Drop Sex from the model: the VIF table above shows Title and Sex are
# near-duplicates, so one of the pair must go; Title is kept because it
# carries extra information (e.g. Mrs vs Miss).
model2 <- update(model1, .~. - Sex) 
summary(model2)
## 
## Call:
## glm(formula = Survived ~ Pclass + Title + Age + SibSp + Parch + 
##     Fare + Embarked, family = "binomial", data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6759  -0.5686  -0.3349   0.5597   2.5210  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  3.643551   0.667890   5.455 4.89e-08 ***
## Pclass2     -1.493393   0.400446  -3.729 0.000192 ***
## Pclass3     -2.664176   0.417739  -6.378 1.80e-10 ***
## TitleMiss    0.988152   0.435553   2.269 0.023285 *  
## TitleMr     -1.605136   0.424125  -3.785 0.000154 ***
## TitleMrs     2.060010   0.502259   4.101 4.11e-05 ***
## Age         -0.054614   0.010955  -4.985 6.19e-07 ***
## SibSp       -0.504292   0.148097  -3.405 0.000661 ***
## Parch       -0.201174   0.152773  -1.317 0.187899    
## Fare         0.002197   0.002897   0.758 0.448303    
## EmbarkedQ   -0.817465   0.726489  -1.125 0.260493    
## EmbarkedS   -0.396807   0.321876  -1.233 0.217653    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 762.00  on 568  degrees of freedom
## Residual deviance: 466.43  on 557  degrees of freedom
## AIC: 490.43
## 
## Number of Fisher Scoring iterations: 5
# Likelihood-ratio test for model2 (null vs residual deviance, df 568-557).
# FIX: spell out FALSE instead of the reassignable shortcut F.
pchisq(762 - 466.43, 568 - 557, lower.tail = FALSE)
## [1] 7.509407e-57
# Collinearity re-check after removing Sex: every adjusted GVIF is now
# close to 1, so the Title/Sex redundancy is resolved.
car::vif(model2)
##              GVIF Df GVIF^(1/(2*Df))
## Pclass   2.360578  2        1.239523
## Title    1.827415  3        1.105706
## Age      1.858486  1        1.363263
## SibSp    1.339123  1        1.157205
## Parch    1.409782  1        1.187342
## Fare     1.610194  1        1.268934
## Embarked 1.209175  2        1.048630
# Backward elimination. The default penalty k = 2 already corresponds to
# AIC. FIX: dropped `criteria = "AIC"` -- step() has no such argument;
# it was silently swallowed by `...` and had no effect.
# NOTE(review): the result is printed but not assigned -- consider
# `model4 <- step(...)` since later code refers to model4.
step(model2, direction = "backward")
## Start:  AIC=490.43
## Survived ~ Pclass + Title + Age + SibSp + Parch + Fare + Embarked
## 
##            Df Deviance    AIC
## - Embarked  2   468.46 488.46
## - Fare      1   467.05 489.05
## - Parch     1   468.23 490.23
## <none>          466.43 490.43
## - SibSp     1   479.37 501.37
## - Age       1   494.61 516.61
## - Pclass    2   509.32 529.32
## - Title     3   625.52 643.52
## 
## Step:  AIC=488.46
## Survived ~ Pclass + Title + Age + SibSp + Parch + Fare
## 
##          Df Deviance    AIC
## - Fare    1   469.54 487.54
## - Parch   1   470.12 488.12
## <none>        468.46 488.46
## - SibSp   1   483.07 501.07
## - Age     1   499.30 517.30
## - Pclass  2   515.05 531.05
## - Title   3   627.84 641.84
## 
## Step:  AIC=487.54
## Survived ~ Pclass + Title + Age + SibSp + Parch
## 
##          Df Deviance    AIC
## - Parch   1   470.75 486.75
## <none>        469.54 487.54
## - SibSp   1   483.62 499.62
## - Age     1   502.34 518.34
## - Pclass  2   560.74 574.74
## - Title   3   629.71 641.71
## 
## Step:  AIC=486.75
## Survived ~ Pclass + Title + Age + SibSp
## 
##          Df Deviance    AIC
## <none>        470.75 486.75
## - SibSp   1   487.93 501.93
## - Age     1   502.50 516.50
## - Pclass  2   561.90 573.90
## - Title   3   636.47 646.47
## 
## Call:  glm(formula = Survived ~ Pclass + Title + Age + SibSp, family = "binomial", 
##     data = train)
## 
## Coefficients:
## (Intercept)      Pclass2      Pclass3    TitleMiss      TitleMr     TitleMrs  
##     3.48123     -1.74922     -2.98355      1.09904     -1.46017      2.03943  
##         Age        SibSp  
##    -0.05632     -0.55124  
## 
## Degrees of Freedom: 568 Total (i.e. Null);  561 Residual
## Null Deviance:       762 
## Residual Deviance: 470.7     AIC: 486.7
# Drop Parch to reach the backward-selection winner
# (Survived ~ Pclass + Title + Age + SibSp, AIC 486.75).
# NOTE(review): `model4` is not defined anywhere in this file -- the
# step() call above printed its result without assigning it. Presumably
# model4 is the Fare/Embarked-reduced model from an earlier session;
# confirm and assign it explicitly before this line.
model5 <- update(model4, .~. -Parch)
summary(model5)
## 
## Call:
## glm(formula = Survived ~ Pclass + Title + Age + SibSp, family = "binomial", 
##     data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8057  -0.5646  -0.3238   0.5677   2.5301  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  3.48123    0.58099   5.992 2.07e-09 ***
## Pclass2     -1.74922    0.34621  -5.052 4.36e-07 ***
## Pclass3     -2.98355    0.34928  -8.542  < 2e-16 ***
## TitleMiss    1.09904    0.43195   2.544 0.010947 *  
## TitleMr     -1.46017    0.41604  -3.510 0.000449 ***
## TitleMrs     2.03943    0.48643   4.193 2.76e-05 ***
## Age         -0.05632    0.01069  -5.269 1.37e-07 ***
## SibSp       -0.55124    0.14214  -3.878 0.000105 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 762.00  on 568  degrees of freedom
## Residual deviance: 470.75  on 561  degrees of freedom
## AIC: 486.75
## 
## Number of Fisher Scoring iterations: 5
# Likelihood-ratio test for model5 (null vs residual deviance, df 568-561).
# FIX: spell out FALSE instead of the reassignable shortcut F.
pchisq(762.00 - 470.75, 568 - 561, lower.tail = FALSE)
## [1] 4.465272e-59
# Hold-out performance of model2: classify as survived when the fitted
# probability exceeds 0.5, then tally correct vs incorrect predictions.
test %>%
  mutate(Prediction = predict(model2, test, type = "response"),
         pred.survival = Prediction > 0.5,
         accurate = pred.survival == Survived) %>%
  group_by(accurate) %>%
  count()
## # A tibble: 2 × 2
## # Groups:   accurate [2]
##   accurate     n
##   <lgl>    <int>
## 1 FALSE       25
## 2 TRUE       118
# Same hold-out evaluation for model5 (0.5 probability cut-off).
test %>%
  mutate(Prediction = predict(model5, test, type = "response"),
         pred.survival = Prediction > 0.5,
         accurate = pred.survival == Survived) %>%
  group_by(accurate) %>%
  count()
## # A tibble: 2 × 2
## # Groups:   accurate [2]
##   accurate     n
##   <lgl>    <int>
## 1 FALSE       23
## 2 TRUE       120
# Hold-out accuracy: correct / total, from the counts printed above.
# model 2
118 / sum(25, 118)
## [1] 0.8251748
# model 5
120 / sum(120, 23)
## [1] 0.8391608

Comparing the candidates by AIC, model 5 has the smallest value. AIC measures goodness of fit while penalising model complexity, so a smaller AIC indicates a better model. Since model 5 also achieved the highest prediction accuracy on the test set, it was chosen as the final model.

# Accuracy of the chosen model on the full cleaned data set.
# FIX: `model3` is never defined in this script; model5 is the model
# selected in the discussion above, so evaluate model5.
# NOTE(review): the stored counts below were produced by the original
# (undefined) model3 call -- re-run to refresh them for model5.
predict(model5, clean, type = "response") %>%
  as.data.frame() %>%
  rename("Prediction" = ".") %>%
  mutate(pred.survival = (Prediction > 0.5)) %>%
  cbind(clean) %>%
  select(pred.survival, Survived) %>%
  mutate(accurate = (pred.survival == Survived)) %>%
  group_by(accurate) %>%
  count()
## # A tibble: 2 × 2
## # Groups:   accurate [2]
##   accurate     n
##   <lgl>    <int>
## 1 FALSE      126
## 2 TRUE       586
# NOTE(review): this block is an exact duplicate of the evaluation just
# above, and `model3` is never defined in this script (model5 appears to
# be intended). Consider deleting this repeat.
predict(model3, clean, type = "response") %>% 
  as.data.frame() %>% 
  rename("Prediction" = ".") %>% 
  mutate(pred.survival = (Prediction > 0.5)) %>% 
  cbind(clean) %>% 
  select(pred.survival, Survived) %>% 
  mutate(accurate = (pred.survival == Survived)) %>% 
  group_by(accurate) %>% 
  count()
## # A tibble: 2 × 2
## # Groups:   accurate [2]
##   accurate     n
##   <lgl>    <int>
## 1 FALSE      126
## 2 TRUE       586
# Post-cleaning overview: no missingness should remain after na.omit().
visdat::vis_dat(clean, palette = "cb_safe") +
  labs(title = "Cleaned Data Visualization")

# Pairwise correlations of the numeric columns (Survived is logical at
# this point and is coerced to 0/1 by cor()).
clean %>%
  select(Age, SibSp, Parch, Fare, Survived) %>%
  cor() %>%
  corrplot::corrplot()

# Fare distribution within each passenger class (free y-scale per facet,
# since first-class fares dwarf the others).
clean %>% 
ggplot(aes(x = Pclass, y = Fare, fill = Pclass))+
  geom_boxplot()+
  facet_wrap(~Pclass, scale ="free_y")+
  theme_bw() +
  scale_fill_brewer(palette = "Set1")

  # NOTE(review): a bare ggplotly() converts the last displayed plot to
  # an interactive one; it requires the plotly package to be attached --
  # no library(plotly) is visible in this file, confirm it is loaded.
  ggplotly()
# Age distribution of survivors vs non-survivors, faceted by sex.
# FIX: removed the group_by() call -- grouping a data frame has no
# effect on ggplot() -- and fixed the "surival" typo in the title.
clean %>%
  ggplot(aes(x = Age, fill = Survived)) +
  geom_histogram() +
  facet_wrap(~Sex) +
  labs(title = "Number of survival by Age and Gender") +
  scale_fill_brewer(palette = "Set1") +
  theme_test()

# Survival proportion by passenger class, faceted by sex: count each
# (class, survived, sex) cell, then normalise within (class, sex).
# FIX: title said "Number of surival" (typo) while the y-axis is a
# proportion -- retitled to match what is plotted.
clean %>%
  group_by(Pclass, Survived, Sex) %>%
  count() %>%
  ungroup() %>%
  group_by(Pclass, Sex) %>%
  mutate(surv_prop = n / sum(n)) %>%
  ggplot(aes(x = Pclass, y = surv_prop, fill = Survived)) +
  geom_col() +
  facet_wrap(~Sex) +
  labs(title = "Proportion of survival by Passenger Class and Gender",
       y = "Probability", x = "Class") +
  scale_fill_brewer(palette = "Set1") +
  theme_test()

################### Data ###################
clean$Survived <- as.character(clean$Survived)

# Plot every variable pair; of limited use with this many categoricals.
plot(clean)

# Numeric-only copy for the correlation chart.
# Columns 2,3,4,9 are Pclass, Title, Sex, Embarked (the non-numerics).
clean2 <- clean[, -c(2, 3, 4, 9)]
# FIX: Survived holds "TRUE"/"FALSE" strings here, and
# as.numeric("TRUE") is NA (with a warning) -- the original line turned
# the whole column into NAs. Route through as.logical() to get 0/1.
clean2$Survived <- as.numeric(as.logical(clean2$Survived))

clean2 %>%
  select(-Survived) %>%
  PerformanceAnalytics::chart.Correlation()

# Age & Sex histogram, faceted by survival outcome
ggplot(clean, aes(x = Age, fill = Sex)) +
  geom_histogram(binwidth = 5) +
  facet_wrap(~Survived)

# Fare & Sex histogram, faceted by survival outcome
ggplot(clean, aes(x = Fare, fill = Sex)) +
  geom_histogram(binwidth = 20) +
  facet_wrap(~Survived)

# SibSp histogram coloured by survival
ggplot(clean, aes(x = SibSp, fill = Survived)) +
  geom_histogram(binwidth = 1)

# Title barplot (stacked by survival)
ggplot(clean, aes(x = Title, fill = Survived)) +
  geom_bar()

# Embarked barplot (side-by-side bars per survival outcome)
ggplot(clean, aes(x = Embarked, fill = Survived)) +
  geom_bar(position = 'dodge')

# Parch barplot (stacked by survival)
ggplot(clean, aes(x = Parch, fill = Survived)) +
  geom_bar()

################### Model 5 ###################

# Residuals vs fitted values.
# NOTE(review): for a glm, model5$residuals are *working* residuals; if
# deviance residuals were intended, use resid(model5, type = "deviance")
# -- confirm which was meant.
plot(model5$fitted.values, model5$residuals) 

# Residuals against each numeric predictor.
plot(train$Age, model5$residuals) 
# identify() is interactive (click points to label them); it returns
# integer(0) when nothing is clicked or in a non-interactive render.
identify(train$Age , model5$residuals) 

## integer(0)
plot(train$Fare , model5$residuals) 
identify(train$Fare , model5$residuals) 

## integer(0)
plot(train$SibSp , model5$residuals) 

plot(train$Parch , model5$residuals)

# QQplot (useless for logistic regression)
qqnorm(model5$residuals)
qqline(model5$residuals)

# Residual Histogram
hist(model5$residuals)

# Residual plot 
# We discovered 3 outlier which are no. obs 202, 266, 323
plot(model5$fitted.values, model5$residuals) 
identify(model5$fitted.values, model5$residuals) 

## integer(0)
# Inspect the three flagged training rows.
train[c(202,266,323),]
## # A tibble: 3 × 9
##   Survived Pclass Title Sex      Age SibSp Parch   Fare Embarked
##   <lgl>    <fct>  <chr> <chr>  <dbl> <dbl> <dbl>  <dbl> <chr>   
## 1 FALSE    1      Miss  female     2     1     2 152.   S       
## 2 TRUE     3      Mr    male      39     0     0   7.92 S       
## 3 FALSE    1      Mrs   female    25     1     2 152.   S
# obs 202 passengers ID = 298
# Name: Allison, Miss. Helen Loraine
# Age: 2
# Fare: 152
# Survived: 0
# Look up each flagged outlier in the raw data by PassengerId.
p_298 <- titanic %>% filter(PassengerId == 298)
p_298
## # A tibble: 1 × 12
##   PassengerId Survived Pclass Name    Sex     Age SibSp Parch Ticket  Fare Cabin
##         <dbl>    <dbl>  <dbl> <chr>   <chr> <dbl> <dbl> <dbl> <chr>  <dbl> <chr>
## 1         298        0      1 Alliso… fema…     2     1     2 113781  152. C22 …
## # … with 1 more variable: Embarked <chr>
# obs 266 passengers ID = 401
p_401 <- titanic %>% filter(PassengerId == 401)
View(p_401)

# obs 323 passengers ID = 499
p_499 <- titanic %>% filter(PassengerId == 499)
# FIX: was View(p_298), which re-displayed passenger 298 instead of the
# just-retrieved passenger 499.
View(p_499)
# Age & Sex histogram, faceted by survival, with a centred title
ggplot(clean, aes(x = Age, fill = Sex)) +
  geom_histogram(binwidth = 5) +
  facet_wrap(~Survived) +
  labs(title = "Survivor by Age & Gender") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_fill_brewer(palette = "Set1")

# Survived back to a factor so fill scales treat it as discrete.
clean$Survived <- as.factor(clean$Survived)

# Fare by survival outcome, split by sex
ggplot(clean, aes(x = Survived, y = Fare, fill = Sex)) +
  geom_boxplot() +
  theme_bw() +
  labs(title = "Survivors by Gender & Fare") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_fill_brewer(palette = "Set1")

# Title barplot stacked by survival
ggplot(clean, aes(x = Title, fill = Survived)) +
  geom_bar() +
  labs(title = "Survivors' Title") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_fill_brewer(palette = "Set1")

# Survival proportion by sex and number of siblings/spouses aboard:
# tally each (SibSp, Survived, Sex) cell, normalise within (SibSp, Sex).
clean %>%
  count(SibSp, Survived, Sex) %>%
  group_by(SibSp, Sex) %>%
  mutate(surv_prop = n / sum(n)) %>%
  ggplot(aes(x = SibSp, y = surv_prop, fill = Survived)) +
  geom_col() +
  facet_wrap(~Sex) +
  labs(title = "No. of survivors by Gender and no. siblings/spouses on board",
       y = "Probability", x = "No. of siblings/spouses on board") +
  scale_fill_brewer(palette = "Set1") +
  theme_test()

# Survival proportion by sex and number of parents/children aboard:
# tally each (Parch, Survived, Sex) cell, normalise within (Parch, Sex).
clean %>%
  count(Parch, Survived, Sex) %>%
  group_by(Parch, Sex) %>%
  mutate(surv_prop = n / sum(n)) %>%
  ggplot(aes(x = Parch, y = surv_prop, fill = Survived)) +
  geom_col() +
  facet_wrap(~Sex) +
  labs(title = "No. of survivors by Gender and no. of parents/children on board",
       y = "Probability", x = "No. of parents/children on board") +
  scale_fill_brewer(palette = "Set1") +
  theme_test()

################### Model 5 ###################
# NOTE(review): this entire section is a byte-for-byte duplicate of the
# "Model 5" diagnostics block earlier in the file; consider removing
# one of the two copies.

plot(model5$fitted.values, model5$residuals) 

plot(train$Age, model5$residuals) 
# identify() is interactive; returns integer(0) when nothing is clicked.
identify(train$Age , model5$residuals) 

## integer(0)
plot(train$Fare , model5$residuals) 
identify(train$Fare , model5$residuals) 

## integer(0)
plot(train$SibSp , model5$residuals) 

plot(train$Parch , model5$residuals)

# QQplot (useless for logistic regression)
qqnorm(model5$residuals)
qqline(model5$residuals)

# Residual Histogram
hist(model5$residuals)

# Residual plot 
# We discovered 3 outlier which are no. obs 202, 266, 323
plot(model5$fitted.values, model5$residuals) 
identify(model5$fitted.values, model5$residuals) 

## integer(0)
train[c(202,266,323),]
## # A tibble: 3 × 9
##   Survived Pclass Title Sex      Age SibSp Parch   Fare Embarked
##   <lgl>    <fct>  <chr> <chr>  <dbl> <dbl> <dbl>  <dbl> <chr>   
## 1 FALSE    1      Miss  female     2     1     2 152.   S       
## 2 TRUE     3      Mr    male      39     0     0   7.92 S       
## 3 FALSE    1      Mrs   female    25     1     2 152.   S
# obs 202 passengers ID = 298
# Name: Allison, Miss. Helen Loraine
# Age: 2
# Fare: 152
# Survived: 0